# Import necessary packages
# For basic operations
import numpy as np
import pandas as pd
from re import search
import datetime, warnings, scipy
import time
# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# For data scaling
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# For dimensionality reduction
!pip install prince
from prince import FAMD
# For addressing imbalance
!pip install imblearn
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from numpy import arange
# For cross validation
from sklearn.model_selection import StratifiedKFold
# For model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, mean_squared_error
from sklearn import metrics
Requirement already satisfied: prince in ./Documents/anaconda3/lib/python3.8/site-packages (0.7.1) Requirement already satisfied: scikit-learn>=0.22.1 in ./Documents/anaconda3/lib/python3.8/site-packages (from prince) (0.24.1) Requirement already satisfied: matplotlib>=3.0.2 in ./Documents/anaconda3/lib/python3.8/site-packages (from prince) (3.3.4) Requirement already satisfied: pandas>=1.0.3 in ./Documents/anaconda3/lib/python3.8/site-packages (from prince) (1.2.4) Requirement already satisfied: scipy>=1.3.0 in ./Documents/anaconda3/lib/python3.8/site-packages (from prince) (1.6.2) Requirement already satisfied: numpy>=1.17.1 in ./Documents/anaconda3/lib/python3.8/site-packages (from prince) (1.20.1) Requirement already satisfied: kiwisolver>=1.0.1 in ./Documents/anaconda3/lib/python3.8/site-packages (from matplotlib>=3.0.2->prince) (1.3.1) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in ./Documents/anaconda3/lib/python3.8/site-packages (from matplotlib>=3.0.2->prince) (2.4.7) Requirement already satisfied: pillow>=6.2.0 in ./Documents/anaconda3/lib/python3.8/site-packages (from matplotlib>=3.0.2->prince) (8.2.0) Requirement already satisfied: python-dateutil>=2.1 in ./Documents/anaconda3/lib/python3.8/site-packages (from matplotlib>=3.0.2->prince) (2.8.1) Requirement already satisfied: cycler>=0.10 in ./Documents/anaconda3/lib/python3.8/site-packages (from matplotlib>=3.0.2->prince) (0.10.0) Requirement already satisfied: six in ./Documents/anaconda3/lib/python3.8/site-packages (from cycler>=0.10->matplotlib>=3.0.2->prince) (1.15.0) Requirement already satisfied: pytz>=2017.3 in ./Documents/anaconda3/lib/python3.8/site-packages (from pandas>=1.0.3->prince) (2021.1) Requirement already satisfied: threadpoolctl>=2.0.0 in ./Documents/anaconda3/lib/python3.8/site-packages (from scikit-learn>=0.22.1->prince) (2.1.0) Requirement already satisfied: joblib>=0.11 in ./Documents/anaconda3/lib/python3.8/site-packages (from 
scikit-learn>=0.22.1->prince) (1.0.1) Requirement already satisfied: imblearn in ./Documents/anaconda3/lib/python3.8/site-packages (0.0) Requirement already satisfied: imbalanced-learn in ./Documents/anaconda3/lib/python3.8/site-packages (from imblearn) (0.8.1) Requirement already satisfied: joblib>=0.11 in ./Documents/anaconda3/lib/python3.8/site-packages (from imbalanced-learn->imblearn) (1.0.1) Requirement already satisfied: numpy>=1.13.3 in ./Documents/anaconda3/lib/python3.8/site-packages (from imbalanced-learn->imblearn) (1.20.1) Requirement already satisfied: scipy>=0.19.1 in ./Documents/anaconda3/lib/python3.8/site-packages (from imbalanced-learn->imblearn) (1.6.2) Requirement already satisfied: scikit-learn>=0.24 in ./Documents/anaconda3/lib/python3.8/site-packages (from imbalanced-learn->imblearn) (0.24.1) Requirement already satisfied: threadpoolctl>=2.0.0 in ./Documents/anaconda3/lib/python3.8/site-packages (from scikit-learn>=0.24->imbalanced-learn->imblearn) (2.1.0)
# Load the 2008 flights dataset from the home directory
flight_df = pd.read_csv('~/flights_2008.csv')
# Drop the leftover positional-index column produced by the CSV export
flight_df = flight_df.drop(columns=['Unnamed: 0'])
# Create a function to calculate the percentage of null values in the dataframe
def check_missing_values(df, df_name):
    """Print, for every column of *df*, the percentage of null entries.

    df      : dataframe to inspect
    df_name : human-readable label used in the report header
    """
    print(f'Percentage of null values in {df_name}')
    print('---------------------------------------------')
    row_count = len(df.index)
    for name, series in df.items():
        null_pct = series.isnull().sum() * 100.0 / row_count
        print(f'Percentage of null values in column {name} = {null_pct}%')
# Report per-column null percentages for the flight dataframe
check_missing_values(flight_df, "Flight Dataframe")
Percentage of null values in Flight Dataframe --------------------------------------------- Percentage of null values in column Year = 0.0% Percentage of null values in column Month = 0.0% Percentage of null values in column DayofMonth = 0.0% Percentage of null values in column DayOfWeek = 0.0% Percentage of null values in column DepTime = 0.0% Percentage of null values in column CRSDepTime = 0.0% Percentage of null values in column ArrTime = 0.3671083325846595% Percentage of null values in column CRSArrTime = 0.0% Percentage of null values in column UniqueCarrier = 0.0% Percentage of null values in column FlightNum = 0.0% Percentage of null values in column TailNum = 0.0002581633843773977% Percentage of null values in column ActualElapsedTime = 0.4330432609546469% Percentage of null values in column CRSElapsedTime = 0.010223270021344948% Percentage of null values in column AirTime = 0.4330432609546469% Percentage of null values in column ArrDelay = 0.4330432609546469% Percentage of null values in column DepDelay = 0.0% Percentage of null values in column Origin = 0.0% Percentage of null values in column Dest = 0.0% Percentage of null values in column Distance = 0.0% Percentage of null values in column TaxiIn = 0.3671083325846595% Percentage of null values in column TaxiOut = 0.02349286797834319% Percentage of null values in column Cancelled = 0.0% Percentage of null values in column CancellationCode = 0.0% Percentage of null values in column Diverted = 0.0% Percentage of null values in column CarrierDelay = 35.588855189961784% Percentage of null values in column WeatherDelay = 35.588855189961784% Percentage of null values in column NASDelay = 35.588855189961784% Percentage of null values in column SecurityDelay = 35.588855189961784% Percentage of null values in column LateAircraftDelay = 35.588855189961784%
# Convert a float-typed Series to a list of Python ints, preserving NaNs
def floatTime_to_integer(df):
    """Return the values of *df* truncated to int; NaN entries stay NaN."""
    converted = []
    for _, value in df.items():
        converted.append(np.nan if str(value) == 'nan' else int(value))
    return converted
# Change float to int for the following time-of-day variables
# (NaN markers are preserved so incomplete flights are not lost yet)
flight_df['DepTime'] = floatTime_to_integer(flight_df['DepTime'])
flight_df['CRSDepTime'] = floatTime_to_integer(flight_df['CRSDepTime'])
flight_df['ArrTime'] = floatTime_to_integer(flight_df['ArrTime'])
flight_df['CRSArrTime'] = floatTime_to_integer(flight_df['CRSArrTime'])
# Convert HHMM-encoded times (int or float) into 'HH.MM' strings
def integerTime_to_string(df):
    """Format a Series of HHMM-encoded times as 'HH.MM' strings.

    854 / 854.0 -> '08.54', 2003 / 2003.0 -> '20.03'. Values with fewer
    than 3 digits (e.g. 9 or 27) are ambiguous — we cannot tell hour from
    minute — and are flagged 'Incorrect format', as are 5+ digit values.
    NaN entries are kept as NaN for later imputation.

    BUG FIX: the original branch `elif str(col) != len(str(col)) == 4:`
    was a broken chained comparison (it reduced to `len(str(col)) == 4`
    only by accident), and `search('.0', ...)` used an unescaped regex dot
    that matches ANY character before a '0'. Both paths happened to yield
    the right output, but the logic now states the intent directly.
    """
    clean_time_data = []
    for _, col in df.items():
        if str(col) == 'nan':  # keep nulls untouched for now
            clean_time_data.append(np.nan)
            continue
        # int() drops a trailing '.0' (NaNs force the column to float dtype)
        digits = str(int(col))
        if len(digits) == 3:
            digits = '0' + digits  # e.g. 854 -> '0854' (08.54 AM)
        if len(digits) == 4:
            clean_time_data.append(digits[:2] + '.' + digits[2:4])
        else:
            clean_time_data.append('Incorrect format')
    return clean_time_data
# Reformat the HHMM integer times as 'HH.MM' strings in new columns
flight_df['clean_DepTime'] = integerTime_to_string(flight_df['DepTime'])
flight_df['clean_CRSDepTime'] = integerTime_to_string(flight_df['CRSDepTime'])
flight_df['clean_ArrTime'] = integerTime_to_string(flight_df['ArrTime'])
flight_df['clean_CRSArrTime'] =integerTime_to_string(flight_df['CRSArrTime'])
# Remove rows with an unparseable time format, then drop the raw columns
flight_df = flight_df[(flight_df['clean_DepTime']!='Incorrect format') & (flight_df['clean_ArrTime']!='Incorrect format') & (flight_df['clean_CRSDepTime']!='Incorrect format') & (flight_df['clean_CRSArrTime']!='Incorrect format')]
flight_df = flight_df.drop(columns={'DepTime','ArrTime','CRSDepTime','CRSArrTime'})
# Map '24.00' to '23.59' (datetime.time rejects hour 24, parsed below)
flight_df['clean_DepTime'] = flight_df['clean_DepTime'].replace('24.00','23.59')
flight_df['clean_CRSDepTime'] = flight_df['clean_CRSDepTime'].replace('24.00','23.59')
flight_df['clean_ArrTime'] = flight_df['clean_ArrTime'].replace('24.00','23.59')
flight_df['clean_CRSArrTime'] = flight_df['clean_CRSArrTime'].replace('24.00','23.59')
# Parse 'HH.MM' strings into datetime.time objects
def stringTime_to_datetime(df):
    """Turn each 'HH.MM' string in *df* into a datetime.time; NaN stays NaN."""
    result = []
    for _, value in df.items():
        text = str(value)
        if text == 'nan':
            result.append(np.nan)
        else:
            result.append(datetime.time(int(text[0:2]), int(text[3:5])))
    return result
# Parse the cleaned 'HH.MM' strings into datetime.time objects
flight_df['clean_DepTime'] = stringTime_to_datetime(flight_df['clean_DepTime'])
flight_df['clean_CRSDepTime'] = stringTime_to_datetime(flight_df['clean_CRSDepTime'])
flight_df['clean_ArrTime'] = stringTime_to_datetime(flight_df['clean_ArrTime'])
flight_df['clean_CRSArrTime'] = stringTime_to_datetime(flight_df['clean_CRSArrTime'])
# Show the cleaned time columns
flight_df[['clean_DepTime','clean_CRSDepTime','clean_ArrTime','clean_CRSArrTime']].head()
| clean_DepTime | clean_CRSDepTime | clean_ArrTime | clean_CRSArrTime | |
|---|---|---|---|---|
| 0 | 20:03:00 | 19:55:00 | 22:11:00 | 22:25:00 |
| 1 | 07:54:00 | 07:35:00 | 10:02:00 | 10:00:00 |
| 2 | 06:28:00 | 06:20:00 | 08:04:00 | 07:50:00 |
| 3 | 18:29:00 | 17:55:00 | 19:59:00 | 19:25:00 |
| 4 | 19:40:00 | 19:15:00 | 21:21:00 | 21:10:00 |
# Summarize missing-value counts by cancellation and diversion status
def get_missing_info(df_name, df_column_name):
    """Count flights grouped by null-status of a column, Cancelled, and Diverted.

    df_name        : dataframe containing 'Cancelled' and 'Diverted' columns
    df_column_name : the Series whose null/non-null split is analyzed
    Returns a dataframe with a 'Category' ('NULL'/'Not NULL') column and a
    'Total Flights' count per (Category, Cancelled, Diverted) combination.
    """
    summary = df_name.copy()
    summary['Category'] = np.where(df_column_name.isnull(), 'NULL', 'Not NULL')
    grouped = summary.groupby(['Category', 'Cancelled', 'Diverted'], as_index=False).size()
    return grouped.rename(columns={'size': 'Total Flights'})
# Break down missing TailNum counts by Cancelled/Diverted status
get_missing_info(flight_df, flight_df['TailNum'])
| Category | Cancelled | Diverted | Total Flights | |
|---|---|---|---|---|
| 0 | NULL | 0 | 0 | 3 |
| 1 | NULL | 0 | 1 | 2 |
| 2 | Not NULL | 0 | 0 | 1842334 |
| 3 | Not NULL | 0 | 1 | 7497 |
| 4 | Not NULL | 1 | 0 | 625 |
# Find the modal ("most frequent") value of a column within each group;
# the mode is later used to impute missing values.
def get_most_frequent_value(df_name, group_by_columns, rank_window_columns, rename_column_from, rename_column_to):
    """Return one row per rank-window group holding its most frequent value.

    df_name            : dataframe to analyze
    group_by_columns   : columns defining the counted combinations
                         (must include the value column being ranked)
    rank_window_columns: columns defining the window the rank is taken over
    rename_column_from : value column to rename in the result
    rename_column_to   : new name for that column (e.g. 'MostFrequentTailNum')

    BUG FIX: the original grouped the module-level `flight_df` instead of
    the `df_name` argument, silently ignoring the caller's dataframe.
    """
    counts = df_name.groupby(group_by_columns, as_index=False).size().rename(columns={'size': 'Total Flights'})
    ranked = counts.copy()
    # method='first' breaks count ties deterministically so exactly one row per window gets Rank 1
    ranked['Rank'] = counts.groupby(rank_window_columns)['Total Flights'].rank(method='first', ascending=False)
    most_frequent_value = ranked[ranked['Rank'] == 1].rename(columns={rename_column_from: rename_column_to})
    return most_frequent_value
# Get the mode value of TailNum within each (FlightNum, Origin, Dest) group
most_frequent_TailNum_df = get_most_frequent_value(flight_df,
group_by_columns=['FlightNum','Origin','Dest','TailNum'],
rank_window_columns=['FlightNum','Origin','Dest'],
rename_column_from='TailNum',
rename_column_to='MostFrequentTailNum'
)
most_frequent_TailNum_df.head()
| FlightNum | Origin | Dest | MostFrequentTailNum | Total Flights | Rank | |
|---|---|---|---|---|---|---|
| 1 | 1 | AUS | ONT | N11187 | 5 | 1.0 |
| 29 | 1 | BUF | JFK | N523JB | 2 | 1.0 |
| 59 | 1 | DAL | HOU | N510SW | 2 | 1.0 |
| 83 | 1 | DCA | SEA | N569AS | 5 | 1.0 |
| 114 | 1 | HOU | CRP | N514SW | 2 | 1.0 |
# Attach each flight's modal TailNum by left-joining on FlightNum/Origin/Dest
flight_df = flight_df.merge(most_frequent_TailNum_df[['FlightNum','Origin','Dest','MostFrequentTailNum']], how = 'left', left_on = ['FlightNum','Origin', 'Dest'], right_on = ['FlightNum','Origin','Dest'])
flight_df.head()
| Year | Month | DayofMonth | DayOfWeek | UniqueCarrier | FlightNum | TailNum | ActualElapsedTime | CRSElapsedTime | AirTime | ... | CarrierDelay | WeatherDelay | NASDelay | SecurityDelay | LateAircraftDelay | clean_DepTime | clean_CRSDepTime | clean_ArrTime | clean_CRSArrTime | MostFrequentTailNum | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 1 | 3 | 4 | WN | 335 | N712SW | 128.0 | 150.0 | 116.0 | ... | NaN | NaN | NaN | NaN | NaN | 20:03:00 | 19:55:00 | 22:11:00 | 22:25:00 | N219WN |
| 1 | 2008 | 1 | 3 | 4 | WN | 3231 | N772SW | 128.0 | 145.0 | 113.0 | ... | NaN | NaN | NaN | NaN | NaN | 07:54:00 | 07:35:00 | 10:02:00 | 10:00:00 | N772SW |
| 2 | 2008 | 1 | 3 | 4 | WN | 448 | N428WN | 96.0 | 90.0 | 76.0 | ... | NaN | NaN | NaN | NaN | NaN | 06:28:00 | 06:20:00 | 08:04:00 | 07:50:00 | N237WN |
| 3 | 2008 | 1 | 3 | 4 | WN | 3920 | N464WN | 90.0 | 90.0 | 77.0 | ... | 2.0 | 0.0 | 0.0 | 0.0 | 32.0 | 18:29:00 | 17:55:00 | 19:59:00 | 19:25:00 | N293 |
| 4 | 2008 | 1 | 3 | 4 | WN | 378 | N726SW | 101.0 | 115.0 | 87.0 | ... | NaN | NaN | NaN | NaN | NaN | 19:40:00 | 19:15:00 | 21:21:00 | 21:10:00 | N263WN |
5 rows × 30 columns
# Replace missing TailNum with the most frequent TailNum for the given
# FlightNum, Origin, and Dest.
# BUG FIX: the original loop repeated the identical branch
# `str(col) == 'nan' and flight_df['Cancelled'][i] == 1` twice, so missing
# TailNums on non-cancelled flights — the only kind present, per the
# missing-info summary above — were never imputed and simply got dropped.
# fillna applies the group mode to every missing TailNum; rows whose group
# has no known TailNum remain null and are removed just below.
flight_df['clean_TailNum'] = flight_df['TailNum'].fillna(flight_df['MostFrequentTailNum'])
# Remove rows that are still missing after imputation
flight_df = flight_df[flight_df['clean_TailNum'].notnull()]
# Drop columns that are no longer needed
flight_df = flight_df.drop(columns=['TailNum','MostFrequentTailNum'])
# Break down missing CRSElapsedTime counts by Cancelled/Diverted status
get_missing_info(flight_df, flight_df['CRSElapsedTime'])
| Category | Cancelled | Diverted | Total Flights | |
|---|---|---|---|---|
| 0 | NULL | 0 | 1 | 198 |
| 1 | Not NULL | 0 | 0 | 1842334 |
| 2 | Not NULL | 0 | 1 | 7299 |
| 3 | Not NULL | 1 | 0 | 625 |
# Get the mode value of CRSElapsedTime within each (FlightNum, Origin, Dest) group
most_frequent_CRSElapsedTime_df = get_most_frequent_value(flight_df,
group_by_columns=['FlightNum','Origin','Dest','CRSElapsedTime'],
rank_window_columns=['FlightNum','Origin','Dest'],
rename_column_from='CRSElapsedTime',
rename_column_to='MostFrequentCRSElapsedTime'
)
most_frequent_CRSElapsedTime_df.head()
| FlightNum | Origin | Dest | MostFrequentCRSElapsedTime | Total Flights | Rank | |
|---|---|---|---|---|---|---|
| 0 | 1 | AUS | ONT | 178.0 | 27 | 1.0 |
| 3 | 1 | BUF | JFK | 80.0 | 11 | 1.0 |
| 5 | 1 | DAL | HOU | 60.0 | 30 | 1.0 |
| 9 | 1 | DCA | SEA | 349.0 | 9 | 1.0 |
| 12 | 1 | HOU | CRP | 50.0 | 42 | 1.0 |
# Attach each flight's modal CRSElapsedTime by left-joining on FlightNum/Origin/Dest
flight_df = flight_df.merge(most_frequent_CRSElapsedTime_df[['FlightNum','Origin','Dest','MostFrequentCRSElapsedTime']], how = 'left', left_on = ['FlightNum','Origin', 'Dest'], right_on = ['FlightNum','Origin','Dest'])
flight_df.head()
| Year | Month | DayofMonth | DayOfWeek | UniqueCarrier | FlightNum | ActualElapsedTime | CRSElapsedTime | AirTime | ArrDelay | ... | WeatherDelay | NASDelay | SecurityDelay | LateAircraftDelay | clean_DepTime | clean_CRSDepTime | clean_ArrTime | clean_CRSArrTime | clean_TailNum | MostFrequentCRSElapsedTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 1 | 3 | 4 | WN | 335 | 128.0 | 150.0 | 116.0 | -14.0 | ... | NaN | NaN | NaN | NaN | 20:03:00 | 19:55:00 | 22:11:00 | 22:25:00 | N712SW | 150.0 |
| 1 | 2008 | 1 | 3 | 4 | WN | 3231 | 128.0 | 145.0 | 113.0 | 2.0 | ... | NaN | NaN | NaN | NaN | 07:54:00 | 07:35:00 | 10:02:00 | 10:00:00 | N772SW | 145.0 |
| 2 | 2008 | 1 | 3 | 4 | WN | 448 | 96.0 | 90.0 | 76.0 | 14.0 | ... | NaN | NaN | NaN | NaN | 06:28:00 | 06:20:00 | 08:04:00 | 07:50:00 | N428WN | 90.0 |
| 3 | 2008 | 1 | 3 | 4 | WN | 3920 | 90.0 | 90.0 | 77.0 | 34.0 | ... | 0.0 | 0.0 | 0.0 | 32.0 | 18:29:00 | 17:55:00 | 19:59:00 | 19:25:00 | N464WN | 90.0 |
| 4 | 2008 | 1 | 3 | 4 | WN | 378 | 101.0 | 115.0 | 87.0 | 11.0 | ... | NaN | NaN | NaN | NaN | 19:40:00 | 19:15:00 | 21:21:00 | 21:10:00 | N726SW | 115.0 |
5 rows × 30 columns
# Replace missing CRSElapsedTime with the most frequent CRSElapsedTime for
# the given FlightNum, Origin, and Dest
# (comment fixed: the original said "most frequent TailNum")
clean_CRSElapsedTime = []
for i, col in flight_df['CRSElapsedTime'].items():
    if str(col) != 'nan':
        clean_CRSElapsedTime.append(col)  # keep known values as-is
    elif str(col) == 'nan':
        clean_CRSElapsedTime.append(flight_df['MostFrequentCRSElapsedTime'][i])  # impute from group mode
    else:
        clean_CRSElapsedTime.append(np.nan)  # NOTE(review): unreachable — the two branches above cover every case
# Get the clean CRSElapsedTime
flight_df['clean_CRSElapsedTime'] = clean_CRSElapsedTime
# Remove rows that are still missing after imputation
flight_df = flight_df[flight_df['clean_CRSElapsedTime'].notnull()]
# Drop unnecessary columns
flight_df = flight_df.drop(columns={'CRSElapsedTime','MostFrequentCRSElapsedTime'})
# Break down missing ActualElapsedTime counts by Cancelled/Diverted status
get_missing_info(flight_df, flight_df['ActualElapsedTime'])
| Category | Cancelled | Diverted | Total Flights | |
|---|---|---|---|---|
| 0 | NULL | 0 | 1 | 7477 |
| 1 | NULL | 1 | 0 | 625 |
| 2 | Not NULL | 0 | 0 | 1842334 |
# Replace null ActualElapsedTime with 0, then drop the raw column
# (per the summary above, nulls occur only for cancelled/diverted flights)
flight_df['clean_ActualElapsedTime'] = flight_df['ActualElapsedTime'].fillna(0)
flight_df = flight_df.drop(columns={'ActualElapsedTime'})
# Break down missing clean_ArrTime counts by Cancelled/Diverted status
get_missing_info(flight_df, flight_df['clean_ArrTime'])
| Category | Cancelled | Diverted | Total Flights | |
|---|---|---|---|---|
| 0 | NULL | 0 | 1 | 6343 |
| 1 | NULL | 1 | 0 | 625 |
| 2 | Not NULL | 0 | 0 | 1842334 |
| 3 | Not NULL | 0 | 1 | 1134 |
%%time
from datetime import datetime
from datetime import timedelta
# Derive an arrival time as departure time + actual elapsed minutes, to fill
# in ArrTime for flights that are missing it
calc_ArrTime = []
for i, col in flight_df['clean_DepTime'].items():
    if str(flight_df['clean_ArrTime'][i]) == 'nan' and flight_df['Cancelled'][i] == 1:  # leave cancelled flights empty
        calc_ArrTime.append('None')
    elif str(col) != 'None' and str(flight_df['clean_ActualElapsedTime'][i]) != 'nan':
        # departure (HH:MM:SS) plus elapsed minutes; date part defaults to 1900-01-01
        col_dt = datetime.strptime(str(col), '%H:%M:%S') + timedelta(minutes = int(flight_df['clean_ActualElapsedTime'][i]))
        calc_ArrTime.append(col_dt)
    else:
        calc_ArrTime.append(col)
flight_df['calculated_ArrTime'] = calc_ArrTime
CPU times: user 1min 6s, sys: 400 ms, total: 1min 7s Wall time: 1min 7s
from datetime import datetime
def string_to_hhmmss(df):
    """Format datetime values (via their string form) as 'HH:MM:SS' strings.

    Entries whose string form is 'nan' or 'None' become NaN.
    """
    result = []
    for _, value in df.items():
        text = str(value)
        if text in ('nan', 'None'):
            result.append(np.nan)
        else:
            result.append(datetime.strptime(text, '%Y-%m-%d %H:%M:%S').strftime('%H:%M:%S'))
    return result
%%time
# Reduce the calculated arrival datetimes to bare 'HH:MM:SS' strings
flight_df['calculated_ArrTime'] = string_to_hhmmss(flight_df['calculated_ArrTime'])
CPU times: user 22.5 s, sys: 139 ms, total: 22.6 s Wall time: 22.7 s
# Replace missing ArrTime with calculated_ArrTime
# Change clean_ArrTime from the destination's local time to the origin's local time
clean_ArrTime = []
for i, col in flight_df['clean_ArrTime'].items():
    if str(col) != 'nan':
        # NOTE(review): known arrival times are also overwritten with the
        # calculated value — this re-expresses them on the origin's clock per
        # the comment above; confirm this is intended
        clean_ArrTime.append(flight_df['calculated_ArrTime'][i])
    elif str(col) == 'nan' and flight_df['Cancelled'][i] == 1:
        clean_ArrTime.append('None')  # cancelled flights have no arrival
    elif str(col) == 'nan' and flight_df['Cancelled'][i] == 0:
        clean_ArrTime.append(flight_df['calculated_ArrTime'][i])  # impute from calculation
    else:
        clean_ArrTime.append(np.nan)  # NOTE(review): unreachable — branches above cover every case
flight_df['clean_ArrTime'] = clean_ArrTime
flight_df = flight_df.drop(columns={'calculated_ArrTime'})
# Break down missing AirTime counts by Cancelled/Diverted status
get_missing_info(flight_df, flight_df['AirTime'])
| Category | Cancelled | Diverted | Total Flights | |
|---|---|---|---|---|
| 0 | NULL | 0 | 1 | 7477 |
| 1 | NULL | 1 | 0 | 625 |
| 2 | Not NULL | 0 | 0 | 1842334 |
# Replace null values in AirTime with 0
flight_df['AirTime'] = flight_df['AirTime'].fillna(0)
# Break down missing TaxiIn counts by Cancelled/Diverted status
get_missing_info(flight_df, flight_df['TaxiIn'])
| Category | Cancelled | Diverted | Total Flights | |
|---|---|---|---|---|
| 0 | NULL | 0 | 1 | 6343 |
| 1 | NULL | 1 | 0 | 625 |
| 2 | Not NULL | 0 | 0 | 1842334 |
| 3 | Not NULL | 0 | 1 | 1134 |
# Break down missing TaxiOut counts by Cancelled/Diverted status
get_missing_info(flight_df, flight_df['TaxiOut'])
| Category | Cancelled | Diverted | Total Flights | |
|---|---|---|---|---|
| 0 | NULL | 1 | 0 | 449 |
| 1 | Not NULL | 0 | 0 | 1842334 |
| 2 | Not NULL | 0 | 1 | 7477 |
| 3 | Not NULL | 1 | 0 | 176 |
# Replace null values in TaxiIn with 0
flight_df['TaxiIn'] = flight_df['TaxiIn'].fillna(0)
# Replace null values in TaxiOut with 0 (comment fixed: original said TaxiIn)
flight_df['TaxiOut'] = flight_df['TaxiOut'].fillna(0)
# Break down missing ArrDelay counts by Cancelled/Diverted status
get_missing_info(flight_df, flight_df['ArrDelay'])
| Category | Cancelled | Diverted | Total Flights | |
|---|---|---|---|---|
| 0 | NULL | 0 | 1 | 7477 |
| 1 | NULL | 1 | 0 | 625 |
| 2 | Not NULL | 0 | 0 | 1842334 |
# Replace null values in ArrDelay with 0
flight_df['ArrDelay'] = flight_df['ArrDelay'].fillna(0)
# Replace nulls in the delay-cause columns with 0
# (presumably null means no delay breakdown was reported — TODO confirm)
flight_df['CarrierDelay'] = flight_df['CarrierDelay'].fillna(0)
flight_df['WeatherDelay'] = flight_df['WeatherDelay'].fillna(0)
flight_df['NASDelay'] = flight_df['NASDelay'].fillna(0)
flight_df['SecurityDelay'] = flight_df['SecurityDelay'].fillna(0)
flight_df['LateAircraftDelay'] = flight_df['LateAircraftDelay'].fillna(0)
# Re-check null values after all imputations (expect 0% everywhere)
check_missing_values(flight_df, "Flight Dataframe")
Percentage of null values in Flight Dataframe --------------------------------------------- Percentage of null values in column Year = 0.0% Percentage of null values in column Month = 0.0% Percentage of null values in column DayofMonth = 0.0% Percentage of null values in column DayOfWeek = 0.0% Percentage of null values in column UniqueCarrier = 0.0% Percentage of null values in column FlightNum = 0.0% Percentage of null values in column AirTime = 0.0% Percentage of null values in column ArrDelay = 0.0% Percentage of null values in column DepDelay = 0.0% Percentage of null values in column Origin = 0.0% Percentage of null values in column Dest = 0.0% Percentage of null values in column Distance = 0.0% Percentage of null values in column TaxiIn = 0.0% Percentage of null values in column TaxiOut = 0.0% Percentage of null values in column Cancelled = 0.0% Percentage of null values in column CancellationCode = 0.0% Percentage of null values in column Diverted = 0.0% Percentage of null values in column CarrierDelay = 0.0% Percentage of null values in column WeatherDelay = 0.0% Percentage of null values in column NASDelay = 0.0% Percentage of null values in column SecurityDelay = 0.0% Percentage of null values in column LateAircraftDelay = 0.0% Percentage of null values in column clean_DepTime = 0.0% Percentage of null values in column clean_CRSDepTime = 0.0% Percentage of null values in column clean_ArrTime = 0.0% Percentage of null values in column clean_CRSArrTime = 0.0% Percentage of null values in column clean_TailNum = 0.0% Percentage of null values in column clean_CRSElapsedTime = 0.0% Percentage of null values in column clean_ActualElapsedTime = 0.0%
# Dataset-level summary figures
DatasetSize = flight_df.shape[0]
CancelledFlightCount = flight_df['Cancelled'].value_counts()[1]
DivertedFlightCount = flight_df['Diverted'].value_counts()[1]
# Cardinality of the categorical identifiers
UniqueCarrierCount = flight_df['UniqueCarrier'].nunique()
FlightNumCount = flight_df['FlightNum'].nunique()
OriginCount = flight_df['Origin'].nunique()
DestCount = flight_df['Dest'].nunique()
print('SOME INFORMATION ABOUT THE DATASET')
print(f'There are {DatasetSize} records in the dataset with {CancelledFlightCount} cancelled flights and {DivertedFlightCount} diverted flights')
print(f'There are {UniqueCarrierCount} unique carriers in the dataset')
print(f'There are {FlightNumCount} unique flight numbers in the dataset')
print(f'There are {OriginCount} unique origins in the dataset')
print(f'There are {DestCount} unique destinations in the dataset')
SOME INFORMATION ABOUT THE DATASET There are 1850436 records in the dataset with 625 cancelled flights and 7477 diverted flights There are 20 unique carriers in the dataset There are 7496 unique flight numbers in the dataset There are 303 unique origins in the dataset There are 304 unique destinations in the dataset
# Number of cancellations by UniqueCarrier
count_cancelled_by_carrier_df = flight_df[flight_df['Cancelled']==1].groupby(['UniqueCarrier'],as_index=False).size().rename(columns={'size':'TotalFlights'}).sort_values('TotalFlights',ascending=False)
# Number of cancellations by month
count_cancelled_by_month_df = flight_df[flight_df['Cancelled']==1].groupby(['Month'],as_index=False).size().rename(columns={'size':'TotalFlights'}).sort_values('TotalFlights',ascending=False)
# Cancellations by origin (top 10 rows charted later)
count_cancelled_by_origin_df = flight_df[flight_df['Cancelled']==1].groupby(['Origin'],as_index=False).size().rename(columns={'size':'TotalFlights'}).sort_values('TotalFlights',ascending=False)
# Cancellations by destination (top 10 rows charted later)
count_cancelled_by_destination_df = flight_df[flight_df['Cancelled']==1].groupby(['Dest'],as_index=False).size().rename(columns={'size':'TotalFlights'}).sort_values('TotalFlights',ascending=False)
def get_pct_of_total(df):
    """Add 'TotalFlightsPct' and its running sum 'TotalFlightsPctCumSum' to *df*.

    Mutates and returns the input dataframe; assumes rows are already
    sorted in the order the cumulative percentage should accumulate.
    """
    total = df['TotalFlights'].sum()
    df['TotalFlightsPct'] = [count / total * 100 for _, count in df['TotalFlights'].items()]
    df['TotalFlightsPctCumSum'] = df['TotalFlightsPct'].cumsum()
    return df
# Add percent-of-total and cumulative-percent columns to each summary table
count_cancelled_by_carrier_df = get_pct_of_total(count_cancelled_by_carrier_df)
count_cancelled_by_month_df = get_pct_of_total(count_cancelled_by_month_df)
count_cancelled_by_origin_df = get_pct_of_total(count_cancelled_by_origin_df)
count_cancelled_by_destination_df = get_pct_of_total(count_cancelled_by_destination_df)
# 2x3 grid of EDA charts on the cleaned flight data
fig, ([eda_ax1, eda_ax2, eda_ax3], [eda_ax4, eda_ax5, eda_ax6]) = plt.subplots(nrows=2, ncols=3, figsize=(30, 15))
fig.suptitle('Exploratory Data Analysis on Flight Data', x=0.5, y=0.97, fontsize=24, fontweight='bold')
subplot_title_font_size = 14
sns.set_palette(sns.color_palette('crest'))
# Pareto chart: cancellations per carrier with cumulative % on a twin axis
eda_ax1.set_title('Most popular airlines with flight cancellations', fontsize=subplot_title_font_size)
sns.barplot(x='UniqueCarrier', y='TotalFlights', data=count_cancelled_by_carrier_df, palette='crest', ax=eda_ax1)
eda_ax1 = eda_ax1.twinx()
sns.lineplot(x='UniqueCarrier', y='TotalFlightsPctCumSum', data=count_cancelled_by_carrier_df, palette='crest', ax=eda_ax1)
sns.scatterplot(x='UniqueCarrier', y='TotalFlightsPctCumSum', data=count_cancelled_by_carrier_df, palette='crest', ax=eda_ax1)
# Cancellations by month
eda_ax2.set_title('Number of flight cancellations by month', fontsize=subplot_title_font_size)
sns.barplot(x='Month', y='TotalFlights', data=count_cancelled_by_month_df, palette='crest', ax=eda_ax2)
# Pareto chart: top-10 origins by cancellations
eda_ax3.set_title('Top 10 origins by the highest number of cancellations', fontsize=subplot_title_font_size)
sns.barplot(x='Origin', y='TotalFlights', data=count_cancelled_by_origin_df.head(10), palette='crest', ax=eda_ax3)
eda_ax3 = eda_ax3.twinx()
sns.lineplot(x='Origin', y='TotalFlightsPctCumSum', data=count_cancelled_by_origin_df.head(10), palette='crest', ax=eda_ax3)
sns.scatterplot(x='Origin', y='TotalFlightsPctCumSum', data=count_cancelled_by_origin_df.head(10), palette='crest', ax=eda_ax3)
# Pareto chart: top-10 destinations by cancellations
eda_ax4.set_title('Top 10 destinations by the highest number of cancellations', fontsize=subplot_title_font_size)
sns.barplot(x='Dest', y='TotalFlights', data=count_cancelled_by_destination_df.head(10), palette='crest', ax=eda_ax4)
eda_ax4 = eda_ax4.twinx()
sns.lineplot(x='Dest', y='TotalFlightsPctCumSum', data=count_cancelled_by_destination_df.head(10), palette='crest', ax=eda_ax4)
sns.scatterplot(x='Dest', y='TotalFlightsPctCumSum', data=count_cancelled_by_destination_df.head(10), palette='crest', ax=eda_ax4)
# Distributions of the numeric features
eda_ax5.set_title('Distribution of Distance', fontsize=subplot_title_font_size)
sns.histplot(x=flight_df['Distance'], bins=10, ax=eda_ax5)
eda_ax6.set_title('Distribution of CRSElapsedTime', fontsize=subplot_title_font_size)
sns.histplot(x=flight_df['clean_CRSElapsedTime'], bins=10, ax=eda_ax6)
plt.show()
# Restrict the dataset to Q4 (October-December) flights and reset the index
flight_df = flight_df[flight_df['Month'].isin([10,11,12])]
flight_df = flight_df.reset_index().drop(columns={'index'})
print(f'The dataset has {flight_df.shape[0]} rows and {flight_df.shape[1]} columns')
cancelled_count = flight_df['Cancelled'].value_counts()[1]
print(f'There are {cancelled_count} cancelled flights out of {flight_df.shape[0]} records')
The dataset has 397541 rows and 29 columns There are 625 cancelled flights out of 397541 records
# Extract the zero-padded hour ('00'-'23') from each datetime.time value
def get_hour_category(df):
    """Return a list of two-digit hour strings for every time in *df*."""
    return [value.strftime('%H') for _, value in df.items()]
# Derive hour-of-day features from the scheduled departure/arrival times
flight_df['clean_CRSDepTime_hour'] = get_hour_category(flight_df['clean_CRSDepTime'])
flight_df['clean_CRSArrTime_hour'] = get_hour_category(flight_df['clean_CRSArrTime'])
# Count total and cancelled flights per day of week, then join the two
count_cancelled_by_dow_df = flight_df[flight_df['Cancelled']==1].groupby('DayOfWeek',as_index=False).size().rename(columns={'size':'TotalCancelledFlights'})
count_flight_by_dow_df = flight_df.groupby('DayOfWeek',as_index=False).size().rename(columns={'size':'TotalFlights'})
count_by_dow_df = count_flight_by_dow_df.merge(count_cancelled_by_dow_df, how='left', left_on='DayOfWeek', right_on='DayOfWeek')
count_by_dow_df
| DayOfWeek | TotalFlights | TotalCancelledFlights | |
|---|---|---|---|
| 0 | 1 | 61062 | 66 |
| 1 | 2 | 53282 | 118 |
| 2 | 3 | 57579 | 103 |
| 3 | 4 | 54528 | 67 |
| 4 | 5 | 62894 | 102 |
| 5 | 6 | 46359 | 77 |
| 6 | 7 | 61837 | 92 |
# Side-by-side bar charts: total vs cancelled flights by day of week
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
ax1.set_title('Total Flights by Day of Week', fontsize=14)
sns.barplot(x='DayOfWeek', y='TotalFlights', data=count_by_dow_df, palette='crest', ax=ax1)
ax2.set_title('Total Cancelled Flights by Day of Week', fontsize=14)
sns.barplot(x='DayOfWeek', y='TotalCancelledFlights', data=count_by_dow_df, palette='crest', ax=ax2)
<AxesSubplot:title={'center':'Total Cancelled Flights by Day of Week'}, xlabel='DayOfWeek', ylabel='TotalCancelledFlights'>
def get_DayOfWeek_category(df):
    """Classify each day-of-week value (1=Mon ... 7=Sun) as 'Weekend' or 'Weekday'.

    Bug fix: the original condition was `col == 6 | col == 7`. Because the
    bitwise `|` binds tighter than `==`, Python parses that as the chained
    comparison `col == (6 | col) == 7`, which misclassifies Saturday (6) as
    a weekday. `col in (6, 7)` expresses the intended test.
    """
    DayOfWeekCategory = []
    for i, col in df.items():
        if col in (6, 7):
            DayOfWeekCategory.append('Weekend')
        else:
            DayOfWeekCategory.append('Weekday')
    return DayOfWeekCategory
# Tag each flight as 'Weekend'/'Weekday' based on its DayOfWeek value.
flight_df['DayOfWeekCategory'] = get_DayOfWeek_category(flight_df['DayOfWeek'])
# Flight counts per scheduled departure/arrival hour, busiest hour first,
# for all flights and for cancelled flights only.
count_flight_by_CRSDepTime_df = flight_df.groupby(['clean_CRSDepTime_hour'],as_index=False).size().rename(columns={'size':'TotalFlights'}).sort_values('TotalFlights',ascending=False)
count_flight_by_CRSArrTime_df = flight_df.groupby(['clean_CRSArrTime_hour'],as_index=False).size().rename(columns={'size':'TotalFlights'}).sort_values('TotalFlights',ascending=False)
count_cancelled_by_CRSDepTime_df = flight_df[flight_df['Cancelled']==1].groupby(['clean_CRSDepTime_hour'],as_index=False).size().rename(columns={'size':'TotalFlights'}).sort_values('TotalFlights',ascending=False)
count_cancelled_by_CRSArrTime_df = flight_df[flight_df['Cancelled']==1].groupby(['clean_CRSArrTime_hour'],as_index=False).size().rename(columns={'size':'TotalFlights'}).sort_values('TotalFlights',ascending=False)
# get_pct_of_total (defined earlier in the file) adds the cumulative
# percentage column TotalFlightsPctCumSum consumed by the Pareto plots below.
count_flight_by_CRSDepTime_df = get_pct_of_total(count_flight_by_CRSDepTime_df)
count_flight_by_CRSArrTime_df = get_pct_of_total(count_flight_by_CRSArrTime_df)
count_cancelled_by_CRSDepTime_df = get_pct_of_total(count_cancelled_by_CRSDepTime_df)
count_cancelled_by_CRSArrTime_df = get_pct_of_total(count_cancelled_by_CRSArrTime_df)
# Pareto-style charts for the engineered time features: bars = flight counts
# per hour; line + points = cumulative percentage on a secondary y-axis.
fig, ([fe_ax1, fe_ax2, fe_ax3], [fe_ax4, fe_ax5, fe_ax6]) = plt.subplots(nrows=2, ncols=3, figsize=(30,15))
fig.suptitle('Feature Engineering Result on CRSDepTime, CRSArrTime, and DayofWeek', x=0.5, y=0.94, fontsize=24, fontweight='bold')
subplot_title_font_size = 14
sns.set_palette(sns.color_palette('crest'))
fe_ax1.set_title('Cancelled flights by hour of departure', fontsize=subplot_title_font_size)
sns.barplot(x='clean_CRSDepTime_hour', y='TotalFlights', data=count_cancelled_by_CRSDepTime_df, palette='crest', ax=fe_ax1)
# Rebind to a twin axis so the cumulative-percentage line gets its own scale.
fe_ax1 = fe_ax1.twinx()
sns.lineplot(x='clean_CRSDepTime_hour', y='TotalFlightsPctCumSum', data=count_cancelled_by_CRSDepTime_df, palette='crest', ax=fe_ax1)
sns.scatterplot(x='clean_CRSDepTime_hour', y='TotalFlightsPctCumSum', data=count_cancelled_by_CRSDepTime_df, palette='crest', ax=fe_ax1)
fe_ax2.set_title('Cancelled flights by hour of arrival', fontsize=subplot_title_font_size)
sns.barplot(x='clean_CRSArrTime_hour', y='TotalFlights', data=count_cancelled_by_CRSArrTime_df, palette='crest', ax=fe_ax2)
fe_ax2 = fe_ax2.twinx()
sns.lineplot(x='clean_CRSArrTime_hour', y='TotalFlightsPctCumSum', data=count_cancelled_by_CRSArrTime_df, palette='crest', ax=fe_ax2)
sns.scatterplot(x='clean_CRSArrTime_hour', y='TotalFlightsPctCumSum', data=count_cancelled_by_CRSArrTime_df, palette='crest', ax=fe_ax2)
fe_ax3.set_title('Cancelled flights by day of week category', fontsize=subplot_title_font_size)
sns.countplot(x='DayOfWeekCategory', data=flight_df[flight_df['Cancelled']==1], palette="crest", ax=fe_ax3)
fe_ax4.set_title('Total flights by hour of departure', fontsize=subplot_title_font_size)
sns.barplot(x='clean_CRSDepTime_hour', y='TotalFlights', data=count_flight_by_CRSDepTime_df, palette='crest', ax=fe_ax4)
fe_ax4 = fe_ax4.twinx()
sns.lineplot(x='clean_CRSDepTime_hour', y='TotalFlightsPctCumSum', data=count_flight_by_CRSDepTime_df, palette='crest', ax=fe_ax4)
sns.scatterplot(x='clean_CRSDepTime_hour', y='TotalFlightsPctCumSum', data=count_flight_by_CRSDepTime_df, palette='crest', ax=fe_ax4)
fe_ax5.set_title('Total flights by hour of arrival', fontsize=subplot_title_font_size)
sns.barplot(x='clean_CRSArrTime_hour', y='TotalFlights', data=count_flight_by_CRSArrTime_df, palette='crest', ax=fe_ax5)
fe_ax5 = fe_ax5.twinx()
sns.lineplot(x='clean_CRSArrTime_hour', y='TotalFlightsPctCumSum', data=count_flight_by_CRSArrTime_df, palette='crest', ax=fe_ax5)
sns.scatterplot(x='clean_CRSArrTime_hour', y='TotalFlightsPctCumSum', data=count_flight_by_CRSArrTime_df, palette='crest', ax=fe_ax5)
fe_ax6.set_title('Total flights by day of week category', fontsize=subplot_title_font_size)
sns.countplot(x='DayOfWeekCategory', data=flight_df, palette="crest", ax=fe_ax6)
plt.show()
# Busy-hour flags: 1 when the scheduled hour is among the busiest cancellation
# hours identified in the Pareto charts above, else 0.
# The original built each flag with a per-row Python loop that also re-created
# the membership list on every iteration; the lists are now hoisted and the
# flags computed with a single vectorised isin() per column (same 0/1 output).
busy_CRSDepTime_list = ['19','17','16','18','14']
busy_CRSArrTime_list = ['20','17','18','22','19','21']
flight_df['busy_CRSDepTime_hour'] = (
    flight_df['clean_CRSDepTime_hour'].astype(str).isin(busy_CRSDepTime_list).astype(int)
)
flight_df['busy_CRSArrTime_hour'] = (
    flight_df['clean_CRSArrTime_hour'].astype(str).isin(busy_CRSArrTime_list).astype(int)
)
# Flag carriers/airports that account for the top share of cancellations,
# using the cumulative-percentage thresholds from the Pareto analysis
# (top 50% of cancellations for carriers, top 20% for origins/destinations).
count_cancelled_by_carrier_df['busy_UniqueCarrier'] = np.where(count_cancelled_by_carrier_df['TotalFlightsPctCumSum'] <=50, 1, 0)
busy_UniqueCarrier_list = count_cancelled_by_carrier_df[count_cancelled_by_carrier_df['busy_UniqueCarrier']==1]['UniqueCarrier'].to_list()
count_cancelled_by_origin_df['busy_Origin'] = np.where(count_cancelled_by_origin_df['TotalFlightsPctCumSum'] <=20, 1, 0)
busy_Origin_list = count_cancelled_by_origin_df[count_cancelled_by_origin_df['busy_Origin']==1]['Origin'].to_list()
count_cancelled_by_destination_df['busy_Dest'] = np.where(count_cancelled_by_destination_df['TotalFlightsPctCumSum'] <=20, 1, 0)
busy_Dest_list = count_cancelled_by_destination_df[count_cancelled_by_destination_df['busy_Dest']==1]['Dest'].to_list()
# Vectorised membership tests replace the original three per-row Python loops
# (same 0/1 integer columns, computed in one pass each).
flight_df['busy_UniqueCarrier'] = flight_df['UniqueCarrier'].astype(str).isin(busy_UniqueCarrier_list).astype(int)
flight_df['busy_Origin'] = flight_df['Origin'].astype(str).isin(busy_Origin_list).astype(int)
flight_df['busy_Dest'] = flight_df['Dest'].astype(str).isin(busy_Dest_list).astype(int)
def get_delay_status(df):
    """Map each total-delay value to 1 (delayed: more than 15 minutes) or 0."""
    return [1 if minutes > 15 else 0 for _, minutes in df.items()]
# Total delay in minutes across the recorded delay components; if any
# component is NaN the pandas sum for that row is NaN.
flight_df['TotalDelay'] = flight_df['DepDelay'] + flight_df['ArrDelay'] + flight_df['CarrierDelay'] + flight_df['WeatherDelay'] + flight_df['NASDelay']
# Binary target: 1 when TotalDelay exceeds 15 minutes (NaN compares False -> 0).
flight_df['Delay'] = get_delay_status(flight_df['TotalDelay'])
# Build a long-format summary with flight counts per status value (0/1) for
# each of the three outcome categories: Cancelled, Diverted, Delay.
cancelled_df = flight_df.groupby(['Cancelled'], as_index=False).size().rename(columns={'size':'TotalFlights','Cancelled':'Status'})
cancelled_df['Category'] = 'Cancelled'
diverted_df = flight_df.groupby(['Diverted'], as_index=False).size().rename(columns={'size':'TotalFlights','Diverted':'Status'})
diverted_df['Category'] = 'Diverted'
delay_df = flight_df.groupby(['Delay'], as_index=False).size().rename(columns={'size':'TotalFlights','Delay':'Status'})
delay_df['Category'] = 'Delay'
flight_status_df = pd.concat([cancelled_df, diverted_df, delay_df]).reset_index().drop(columns={'index'})
# Human-readable labels for the pie-chart legend.
flight_status_df['Status'] = flight_status_df['Status'].replace({1:'Yes',0:'No'})
# Two-colour palette shared by all three donut charts (No / Yes).
plotly_colors = ['#9dc098', '#2b667c']
# 1x3 grid of 'domain' subplots — the subplot type required for pie traces.
flight_status_fig = make_subplots(rows=1, cols=3,
specs=[[{'type':'domain'},
{'type':'domain'},
{'type':'domain'}]])
# One pie per outcome category, sliced from the long-format status frame.
flight_status_fig.add_trace(go.Pie(labels=flight_status_df[flight_status_df['Category']=='Cancelled']['Status'],
values=flight_status_df[flight_status_df['Category']=='Cancelled']['TotalFlights'],
name="Cancelled",
marker_colors=plotly_colors), 1, 1)
flight_status_fig.add_trace(go.Pie(labels=flight_status_df[flight_status_df['Category']=='Diverted']['Status'],
values=flight_status_df[flight_status_df['Category']=='Diverted']['TotalFlights'],
name="Diverted",
marker_colors=plotly_colors), 1, 2)
flight_status_fig.add_trace(go.Pie(labels=flight_status_df[flight_status_df['Category']=='Delay']['Status'],
values=flight_status_df[flight_status_df['Category']=='Delay']['TotalFlights'],
name="Delay",
marker_colors=plotly_colors), 1, 3)
# hole=0.5 turns each pie into a donut.
flight_status_fig.update_traces(hole=0.5, hoverinfo="label+percent+name")
flight_status_fig.update_layout(
title="<b>Percentage of flights by flight status<b>",
legend=dict(
orientation='h',
x=0.42,
y=1.1,
bgcolor='rgba(255, 255, 255, 0)',
bordercolor='rgba(255, 255, 255, 0)'
),
# Category labels placed beneath each donut.
annotations=[dict(text='<b>Cancelled<b>', x=0.1, y=0.05, font_size=12, showarrow=False),
dict(text='<b>Diverted<b>', x=0.5, y=0.05, font_size=12, showarrow=False),
dict(text='<b>Delay<b>', x=0.885, y=0.05, font_size=12, showarrow=False)],
plot_bgcolor='rgba(0, 0, 0, 0)',
paper_bgcolor='rgba(0, 0, 0, 0)',
uniformtext_minsize=8,
uniformtext_mode='show'
)
flight_status_fig.show()
# 2x3 grid of scatter plots: each numeric feature (Distance, CRSElapsedTime)
# against each binary target (Cancelled, Diverted, Delay), coloured by target.
fig, ([num_ax1, num_ax2, num_ax3],[num_ax4,num_ax5, num_ax6]) = plt.subplots(nrows=2, ncols=3, figsize=(20,18))
fig.suptitle('Relationship between numerical variables and target variables', x=0.5, y=0.93, fontsize=20, fontweight='bold')
colors = ['#59958a', '#2b667c']
sns.set_palette(sns.color_palette(colors))
num_ax1.set_title('Distance vs Cancelled', fontsize=14)
sns.scatterplot(data=flight_df,x=flight_df['Distance'], y=flight_df['Cancelled'], hue=flight_df['Cancelled'], palette=colors, ax=num_ax1)
num_ax2.set_title('CRSElapsedTime vs Cancelled', fontsize=14)
sns.scatterplot(data=flight_df, x=flight_df['clean_CRSElapsedTime'], y=flight_df['Cancelled'], hue=flight_df['Cancelled'], palette=colors, ax=num_ax2)
num_ax3.set_title('Distance vs Diverted', fontsize=14)
sns.scatterplot(data=flight_df,x=flight_df['Distance'], y=flight_df['Diverted'], hue=flight_df['Diverted'], palette=colors, ax=num_ax3)
num_ax4.set_title('CRSElapsedTime vs Diverted', fontsize=14)
sns.scatterplot(data=flight_df, x=flight_df['clean_CRSElapsedTime'], y=flight_df['Diverted'], hue=flight_df['Diverted'], palette=colors, ax=num_ax4)
num_ax5.set_title('Distance vs Delay', fontsize=14)
sns.scatterplot(data=flight_df,x=flight_df['Distance'], y=flight_df['Delay'], hue=flight_df['Delay'], palette=colors, ax=num_ax5)
num_ax6.set_title('CRSElapsedTime vs Delay', fontsize=14)
sns.scatterplot(data=flight_df, x=flight_df['clean_CRSElapsedTime'], y=flight_df['Delay'], hue=flight_df['Delay'], palette=colors, ax=num_ax6)
plt.show()
def transform_categorical_variables(df, columns_to_transform):
    """Append one-hot dummy columns (named '<col>_<value>') for each listed column.

    The original columns are retained; the result is a copy of *df* with the
    indicator columns concatenated on the right.

    Bug fix: the original passed `transformed_data[col].values` to
    pd.get_dummies, which drops the index, so the subsequent
    `pd.concat(..., axis=1)` misaligned rows (producing NaNs) whenever the
    frame's index was not the default 0..n-1. Passing the Series keeps the
    index and makes the concat alignment exact.
    """
    transformed_data = df.copy()
    for col in columns_to_transform:
        dummies = pd.get_dummies(transformed_data[col], prefix=col)
        transformed_data = pd.concat([transformed_data, dummies], axis=1)
    return transformed_data
%%time
# One-hot encode the selected variables; reset the index first so the later
# positional concat with the other feature frame aligns row-for-row.
selected_categorical_variables = ['Month','DayOfWeekCategory','Cancelled', 'Delay']
transformed_flight_df = flight_df[selected_categorical_variables]
transformed_flight_df = transformed_flight_df.reset_index().drop(columns={'index'})
transformed_flight_df = transform_categorical_variables(transformed_flight_df, columns_to_transform = selected_categorical_variables)
CPU times: user 88.8 ms, sys: 64.5 ms, total: 153 ms Wall time: 184 ms
# Drop unnecessary columns: the dummies for the targets themselves must not
# leak into the feature set (the raw Cancelled/Delay columns are kept for now).
transformed_flight_df = transformed_flight_df.drop(columns={'Cancelled_0', 'Cancelled_1','Delay_0','Delay_1'})
# Numeric and engineered binary features to combine with the encoded frame.
other_features = flight_df[['Distance','clean_CRSElapsedTime','busy_UniqueCarrier','busy_Origin','busy_Dest','busy_CRSDepTime_hour','busy_CRSArrTime_hour']]
selected_features_df = pd.concat([transformed_flight_df, other_features], axis=1)
# Drop unnecessary columns: 'Delay' is an alternative target, not a feature,
# for the cancellation-prediction problem.
cancellation_problem_selected_features_df = selected_features_df.drop(columns={'Delay'})
y = cancellation_problem_selected_features_df['Cancelled']
x = cancellation_problem_selected_features_df.drop(columns={'Cancelled'})
# split data
# NOTE(review): the split is not stratified; with roughly 0.16% positives,
# consider stratify=y so both splits keep the same cancellation rate — confirm.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train_records = x_train.shape[0]
y_train_records = y_train.shape[0]
x_test_records = x_test.shape[0]
y_test_records = y_test.shape[0]
print('Total records in each dataset')
print(f'x_train = {x_train_records} records')
print(f'y_train = {y_train_records} records')
print(f'x_test = {x_test_records} records')
print(f'y_test = {y_test_records} records')
print('\n')
# Report the class balance in each split.
y_train_df = pd.DataFrame()
y_train_df['Cancelled'] = y_train
print('Total cancellation in y_train')
print(y_train_df.value_counts())
print('\n')
y_test_df = pd.DataFrame()
y_test_df['Cancelled'] = y_test
print('Total cancellation in y_test')
print(y_test_df.value_counts())
Total records in each dataset x_train = 318032 records y_train = 318032 records x_test = 79509 records y_test = 79509 records Total cancellation in y_train Cancelled 0 317531 1 501 dtype: int64 Total cancellation in y_test Cancelled 0 79385 1 124 dtype: int64
# Fit scaler into training set only, then apply it to both splits — fitting
# on the full data would leak test-set statistics into training.
column_list = ['Distance', 'clean_CRSElapsedTime']
x_train_to_scale = x_train[column_list]
x_test_to_scale = x_test[column_list]
column_transform = ColumnTransformer([('somename', StandardScaler(), column_list)], remainder='passthrough')
column_transform.fit(x_train_to_scale)
# Scaled training data
scaled_x_train_result = column_transform.transform(x_train_to_scale)
x_train_scaled = x_train.copy()
x_train_scaled[['Distance','clean_CRSElapsedTime']] = scaled_x_train_result
# Scaled test data — transform once and reuse the result (the original called
# column_tranform.transform(x_test_to_scale) a second time on assignment).
scaled_x_test_result = column_transform.transform(x_test_to_scale)
x_test_scaled = x_test.copy()
x_test_scaled[['Distance','clean_CRSElapsedTime']] = scaled_x_test_result
# Remove the raw columns that were replaced by their one-hot encodings.
x_train_scaled = x_train_scaled.drop(columns={'Month','DayOfWeekCategory_Weekend', 'DayOfWeekCategory'})
x_test_scaled = x_test_scaled.drop(columns={'Month','DayOfWeekCategory_Weekend', 'DayOfWeekCategory'})
# Declare the training and test data
X_train = x_train_scaled
y_train = y_train.copy()
X_test = x_test_scaled
y_test = y_test.copy()
logreg_start_time = time.time()
# imblearn pipeline: SMOTE oversampling runs inside each CV fold (training
# portion only), so the validation fold is never synthetically inflated.
logreg_pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=42)],
['classifier', LogisticRegression(random_state=42)]])
# C = inverse regularisation strength for logistic regression.
logreg_param_grid = {'classifier__C':[0.001, 0.01]}
logreg_stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Run the model
logreg_grid_search = GridSearchCV(estimator=logreg_pipeline,
param_grid=logreg_param_grid,
scoring='roc_auc',
cv=logreg_stratified_kfold,
n_jobs=-1,
refit=True)
logreg_grid_search.fit(X_train, y_train)
logreg_y_predicted = logreg_grid_search.best_estimator_.predict(X_test)
logreg_cpu_time = round(time.time() - logreg_start_time, 2)
print(f'CPU time: {logreg_cpu_time} seconds')
# Evaluate the model
# NOTE(review): scoring='roc_auc', so best_score_ and score() return AUC
# values, not accuracy, despite the "Accuracy Rate" labels printed below.
logreg_cv_score = round(logreg_grid_search.best_score_*100,2)
logreg_accuracy = round(logreg_grid_search.score(X_test, y_test)*100,2)
logreg_best_classifier = logreg_grid_search.best_estimator_
logreg_MSE = mean_squared_error(y_test, logreg_y_predicted)
print(f'Best classifier {logreg_best_classifier}')
print(f'Logistics Regression Accuracy Rate (CV score): {logreg_cv_score}%')
print(f'Logistics Regression Accuracy Rate (Test score): {logreg_accuracy}%')
print(f'Logistics Regression MSE: {logreg_MSE}')
CPU time: 14.36 seconds
Best classifier Pipeline(steps=[('smote', SMOTE(random_state=42)),
['classifier', LogisticRegression(C=0.001, random_state=42)]])
Logistics Regression Accuracy Rate (CV score): 72.62%
Logistics Regression Accuracy Rate (Test score): 72.41%
Logistics Regression MSE: 0.3875787646681508
# ROC curve and AUC for the tuned logistic-regression model.
# logreg_grid_search was already fitted above with refit=True, so the
# original `logreg_grid_search.fit(X_train, y_train)` here re-ran the whole
# cross-validated search redundantly; the fitted search is used directly.
metrics.plot_roc_curve(logreg_grid_search, X_test, y_test)
logreg_predict_prob = logreg_grid_search.predict_proba(X_test)
logreg_model_probs = logreg_predict_prob[:, 1]  # probability of class 1 (cancelled)
logreg_roc_auc = round(roc_auc_score(y_test, logreg_model_probs),2)
print(f'Logistics Regression AUC Score: {logreg_roc_auc}')
Logistics Regression AUC Score: 0.72
dt_start_time = time.time()
# Same SMOTE-inside-CV pipeline pattern as the logistic-regression model.
dt_pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
['classifier', DecisionTreeClassifier(random_state=42)]])
# 4 x 4 x 4 = 64 candidate trees, each cross-validated 10 times.
dt_param_grid = {'classifier__min_samples_split' : [2,3,4,5],
'classifier__min_samples_leaf' : [2,3,4,5],
'classifier__max_depth' : [2,3,4,5]}
dt_stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True,random_state=11)
# Run the model
dt_grid_search = GridSearchCV(estimator=dt_pipeline,
param_grid=dt_param_grid,
scoring='roc_auc',
cv=dt_stratified_kfold,
n_jobs=-1,
refit=True)
dt_grid_search.fit(X_train, y_train)
dt_y_predicted = dt_grid_search.predict(X_test)
dt_cpu_time = round(time.time() - dt_start_time, 2)
print(f'CPU time: {dt_cpu_time} seconds')
# Evaluate the model
# NOTE(review): scoring='roc_auc', so these "Accuracy Rate" figures are AUC.
dt_cv_score = round(dt_grid_search.best_score_*100,2)
dt_accuracy = round(dt_grid_search.score(X_test, y_test)*100,2)
dt_best_classifier = dt_grid_search.best_estimator_
dt_MSE = mean_squared_error(y_test, dt_y_predicted)
print(f'Best classifier {dt_best_classifier}')
print(f'Decision Tree Accuracy Rate (CV score): {dt_cv_score}%')
print(f'Decision Tree Accuracy Rate (Test score): {dt_accuracy}%')
print(f'Decision Tree MSE: {dt_MSE}')
/Users/hutaminadya/Documents/anaconda3/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py:688: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
CPU time: 146.89 seconds
Best classifier Pipeline(steps=[('smote', SMOTE(random_state=11)),
['classifier',
DecisionTreeClassifier(max_depth=3, min_samples_leaf=2,
random_state=42)]])
Decision Tree Accuracy Rate (CV score): 70.52%
Decision Tree Accuracy Rate (Test score): 71.94%
Decision Tree MSE: 0.20320969953087073
# ROC curve and AUC for the tuned decision tree. dt_grid_search was already
# fitted above (refit=True); the original re-fit here repeated the entire
# 10-fold x 64-candidate search for no benefit, so it is removed.
metrics.plot_roc_curve(dt_grid_search, X_test, y_test)
dt_predict_prob = dt_grid_search.predict_proba(X_test)
dt_model_probs = dt_predict_prob[:, 1]  # probability of class 1 (cancelled)
dt_roc_auc = round(roc_auc_score(y_test, dt_model_probs),2)
print(f'Decision Tree AUC Score: {dt_roc_auc}')
Decision Tree AUC Score: 0.72
rf_start_time = time.time()
# Random forest with the same SMOTE-inside-CV pipeline; 5 folds instead of 10
# to keep the 27-candidate search tractable.
rf_pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
['classifier', RandomForestClassifier(random_state = 42)]])
rf_param_grid = {'classifier__min_samples_split' : [2,3,5],
'classifier__min_samples_leaf' : [2,3,5],
'classifier__max_depth' : [3,5,10]}
rf_stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True,random_state=11)
# Run the model
rf_grid_search = GridSearchCV(estimator=rf_pipeline,param_grid=rf_param_grid,scoring='roc_auc',cv=rf_stratified_kfold,n_jobs=-1,refit=True)
rf_grid_search.fit(X_train, y_train)
rf_y_predicted = rf_grid_search.predict(X_test)
rf_cpu_time = round(time.time() - rf_start_time, 2)
print(f'CPU time: {rf_cpu_time} seconds')
# Evaluate the model
# NOTE(review): scoring='roc_auc', so these "Accuracy Rate" figures are AUC.
rf_cv_score = round(rf_grid_search.best_score_*100,2)
rf_accuracy = round(rf_grid_search.score(X_test, y_test)*100,2)
rf_best_classifier = rf_grid_search.best_estimator_
rf_MSE = mean_squared_error(y_test, rf_y_predicted)
print(f'Best classifier {rf_best_classifier}')
print(f'Random Forest Accuracy Rate (CV score): {rf_cv_score}%')
print(f'Random Forest Accuracy Rate (Test score): {rf_accuracy}%')
print(f'Random Forest MSE: {rf_MSE}')
CPU time: 762.19 seconds
Best classifier Pipeline(steps=[('smote', SMOTE(random_state=11)),
['classifier',
RandomForestClassifier(max_depth=3, min_samples_leaf=2,
random_state=42)]])
Random Forest Accuracy Rate (CV score): 73.05%
Random Forest Accuracy Rate (Test score): 74.67%
Random Forest MSE: 0.36008502182142904
# ROC curve and AUC for the tuned random forest. rf_grid_search was already
# fitted above (refit=True); the original re-fit here re-ran the whole search
# (~12 minutes per the timing output) redundantly, so it is removed.
metrics.plot_roc_curve(rf_grid_search, X_test, y_test)
rf_predict_prob = rf_grid_search.predict_proba(X_test)
rf_model_probs = rf_predict_prob[:, 1]  # probability of class 1 (cancelled)
rf_roc_auc = round(roc_auc_score(y_test, rf_model_probs),2)
print(f'Random Forest AUC Score: {rf_roc_auc}')
Random Forest AUC Score: 0.75
adaboost_start_time = time.time()
# AdaBoost with the same SMOTE-inside-CV pipeline pattern.
adaboost_pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
['classifier', AdaBoostClassifier()]])
adaboost_param_grid = {'classifier__n_estimators': [5, 10, 25],
'classifier__learning_rate': [0.05, 0.1, 0.25, 0.5]}
adaboost_stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True,random_state=11)
# Run the model (refit defaults to True, so the best estimator is retrained
# on the full training set after the search).
adaboost_grid_search = GridSearchCV(estimator=adaboost_pipeline,
param_grid=adaboost_param_grid,
scoring='roc_auc',
cv=adaboost_stratified_kfold,
n_jobs=-1)
adaboost_grid_search.fit(X_train, y_train)
adaboost_y_predicted = adaboost_grid_search.predict(X_test)
adaboost_cpu_time = round(time.time() - adaboost_start_time, 2)
print(f'CPU time: {adaboost_cpu_time} seconds')
# Evaluate the model
# NOTE(review): scoring='roc_auc', so these "Accuracy Rate" figures are AUC.
adaboost_cv_score = round(adaboost_grid_search.best_score_*100,2)
adaboost_accuracy = round(adaboost_grid_search.score(X_test, y_test)*100,2)
adaboost_best_classifier = adaboost_grid_search.best_estimator_
adaboost_MSE = mean_squared_error(y_test, adaboost_y_predicted)
print(f'Best classifier {adaboost_best_classifier}')
print(f'Ada Boost Accuracy Rate (CV score): {adaboost_cv_score}%')
print(f'Ada Boost Accuracy Rate (Test score): {adaboost_accuracy}%')
print(f'Ada Boost MSE: {adaboost_MSE}')
CPU time: 52.63 seconds
Best classifier Pipeline(steps=[('smote', SMOTE(random_state=11)),
['classifier',
AdaBoostClassifier(learning_rate=0.5, n_estimators=10)]])
Ada Boost Accuracy Rate (CV score): 72.96%
Ada Boost Accuracy Rate (Test score): 72.96%
Ada Boost MSE: 0.23548277553484512
# ROC curve and AUC for the tuned AdaBoost model. adaboost_grid_search was
# already fitted above; the original re-fit here repeated the full search
# redundantly, so it is removed.
metrics.plot_roc_curve(adaboost_grid_search, X_test, y_test)
adaboost_predict_prob = adaboost_grid_search.predict_proba(X_test)
adaboost_model_probs = adaboost_predict_prob[:, 1]  # probability of class 1 (cancelled)
adaboost_roc_auc = round(roc_auc_score(y_test, adaboost_model_probs),2)
print(f'Ada Boosting AUC Score: {adaboost_roc_auc}')
Ada Boosting AUC Score: 0.73
gboost_start_time = time.time()
# Gradient boosting with the same SMOTE-inside-CV pipeline pattern.
gboost_pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
['classifier', GradientBoostingClassifier()]])
gboost_param_grid = {'classifier__n_estimators': [10, 50],
'classifier__learning_rate': [0.1, 0.2, 0.25, 0.5]}
gboost_stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True,random_state=11)
# Run the model (refit defaults to True; the fitted search object is kept in
# gboost_results for the ROC plot below).
gboost_grid_search = GridSearchCV(estimator=gboost_pipeline,
param_grid=gboost_param_grid,
scoring='roc_auc',
cv=gboost_stratified_kfold,
n_jobs=-1)
gboost_results = gboost_grid_search.fit(X_train, y_train)
gboost_y_predicted = gboost_grid_search.predict(X_test)
gboost_cpu_time = round(time.time() - gboost_start_time, 2)
print(f'CPU time: {gboost_cpu_time} seconds')
# Evaluate the model
# NOTE(review): scoring='roc_auc', so these "Accuracy Rate" figures are AUC.
gboost_cv_score = round(gboost_grid_search.best_score_*100,2)
gboost_accuracy = round(gboost_grid_search.score(X_test, y_test)*100,2)
gboost_best_classifier = gboost_grid_search.best_estimator_
gboost_MSE = mean_squared_error(y_test, gboost_y_predicted)
print(f'Best classifier {gboost_best_classifier}')
print(f'Gradient Boost Accuracy Rate (CV score): {gboost_cv_score}%')
print(f'Gradient Boost Accuracy Rate (Test score): {gboost_accuracy}%')
print(f'Gradient Boost MSE: {gboost_MSE}')
CPU time: 154.17 seconds
Best classifier Pipeline(steps=[('smote', SMOTE(random_state=11)),
['classifier', GradientBoostingClassifier(n_estimators=50)]])
Gradient Boost Accuracy Rate (CV score): 72.09%
Gradient Boost Accuracy Rate (Test score): 75.52%
Gradient Boost MSE: 0.25673823089210024
# ROC curve and AUC for the tuned gradient-boosting model; gboost_results is
# the already-fitted grid search, so no refit is needed here.
metrics.plot_roc_curve(gboost_results, X_test, y_test)
gboost_predict_prob = gboost_grid_search.predict_proba(X_test)
# Column 1 = probability of the positive class.
gboost_model_probs = gboost_predict_prob[:, 1]
gboost_roc_auc = round(roc_auc_score(y_test, gboost_model_probs),2)
print(f'Gradient Boosting AUC Score: {gboost_roc_auc}')
Gradient Boosting AUC Score: 0.76
# Summary table comparing all five models on runtime, CV/test score, MSE, AUC.
model_performance_dict = {'Algorithm':['Logistic Regression', 'Decision Tree', 'Random Forest', 'Ada Boosting', 'Gradient Boosting'],
'CPU time(seconds)': [logreg_cpu_time, dt_cpu_time, rf_cpu_time, adaboost_cpu_time, gboost_cpu_time],
'CV Score (%)': [logreg_cv_score, dt_cv_score, rf_cv_score, adaboost_cv_score, gboost_cv_score],
'Test Score/Accuracy (%)': [logreg_accuracy, dt_accuracy, rf_accuracy, adaboost_accuracy, gboost_accuracy],
'MSE': [logreg_MSE, dt_MSE, rf_MSE, adaboost_MSE, gboost_MSE],
'AUC':[logreg_roc_auc, dt_roc_auc, rf_roc_auc, adaboost_roc_auc ,gboost_roc_auc]}
model_performance_df = pd.DataFrame.from_dict(model_performance_dict)
model_performance_df
| Algorithm | CPU time(seconds) | CV Score (%) | Test Score/Accuracy (%) | MSE | AUC | |
|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 14.36 | 72.62 | 72.41 | 0.387579 | 0.72 |
| 1 | Decision Tree | 146.89 | 70.52 | 71.94 | 0.203210 | 0.72 |
| 2 | Random Forest | 762.19 | 73.05 | 74.67 | 0.360085 | 0.75 |
| 3 | Ada Boosting | 52.63 | 72.96 | 72.96 | 0.235483 | 0.73 |
| 4 | Gradient Boosting | 154.17 | 72.09 | 75.52 | 0.256738 | 0.76 |
Gradient Boosting showed the highest accuracy at 75.52% and also the highest AUC score (0.76), with Random Forest close behind (74.67% accuracy, 0.75 AUC). For some business problems, choosing the best model based on the accuracy rate might be the most appropriate approach. However, for flight cancellation problems with severe class imbalance, it is more appropriate to choose the best performing model based on the AUC score, because it indicates the ability of a classifier to distinguish between classes. When the AUC is low, it means that the classifiers only make predictions randomly without actually having the capability to differentiate the classes. Even though the AUC score of these prediction models was not excellent (>0.8), it is still acceptable to deploy the model into production because at this rate, the model is considered to be sensitive to cancellation and delays. In other words, the model tends to classify a flight as a cancelled flight when it is not, which is better than failing to spot a cancelled or delayed flight.
In terms of CPU time, which denotes the time elapsed while running the model, Random Forest ran longer than the other algorithms (762.19 seconds), resulting in a relatively high AUC (0.75) and the second-highest accuracy rate (74.67%). Meanwhile, the least performing algorithm was Logistic Regression, showing the lowest AUC (0.72, tied with Decision Tree) and a low accuracy rate (72.41%). Although it requires the shortest time to run, the result is not as good as the other classifiers.
Overall, we can conclude that different algorithms might result in different performances. As we can measure model performance by various parameters, it is highly important to thoroughly understand the problem to correctly evaluate the model and draw conclusions out of it.